In [ ]:
!pip install -r /content/requirements.txt -q
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 485.9/485.9 kB 2.8 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 538.2/538.2 kB 8.6 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.1/12.1 MB 14.4 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 160.5/160.5 kB 11.6 MB/s eta 0:00:00 Preparing metadata (setup.py) ... done ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 258.0/258.0 kB 18.5 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 81.9/81.9 kB 6.2 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 194.1/194.1 kB 13.1 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 79.9/79.9 MB 4.8 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 106.8/106.8 kB 9.7 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 80.7/80.7 kB 8.7 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 21.9/21.9 MB 41.1 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.0/44.0 kB 4.6 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 56.7 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 51.1 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.2/10.2 MB 50.7 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 144.8/144.8 kB 14.6 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 54.6 MB/s eta 0:00:00 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 128.8/128.8 kB 10.4 MB/s eta 0:00:00 Building wheel for pyod (setup.py) ... done
In [ ]:
# native imports
import warnings
# utility imports
import pandas as pd
import numpy as np
# stats pkgs
import statsmodels.api as stat_model_api
import statsmodels as stat_model
# viz pkgs
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
# automl
import pycaret
from pycaret.regression import *
# preprocessing pkgs
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
In [ ]:
# defaults: notebook-wide configuration, applied once up front
%matplotlib inline
# silence library warnings so notebook output stays readable
warnings.filterwarnings("ignore")
# default figure size for every matplotlib/seaborn plot (16:9)
matplotlib.rcParams["figure.figsize"] = [16, 9]
# fix the NumPy RNG seed for reproducible sampling/splits
np.random.seed(45)
# shared MinMaxScaler instance — NOTE(review): not fitted in this chunk;
# presumably fitted later during preprocessing, verify downstream
data_scaler = MinMaxScaler()
Data Check¶
The purpose of the data check is to:
- understand the nature of the data
- spot missing data
- spot outliers
- identify correlations
- identify interaction effects
- identify transformation effects
In [ ]:
# load the data from the mounted Colab path and preview the first rows
# (26 columns: hourly energy readings, calendar flags, and weather forecasts)
data = pd.read_csv("/content/data.csv")
data.head()
Out[ ]:
| time | Comms and Services | Space Heating | Hot Water | Sockets | Lighting | Bld_EngCons | Car Chargers | weekend | bank holiday | ... | forecastperiod | forecast_temperature | forecast_feelslike | forecast_weathertype | forecast_windspeed | forecast_uvindex | forecast_precipitationprobability | forecast_winddirection | forecast_visibility | forecast_interval | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019-04-01 00:00:00+00 | 0.515253 | 0.856489 | 0.000000 | 0.051467 | 0.124797 | 1.548006 | 0.032239 | False | False | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 2019-04-01 01:00:00+00 | 0.687381 | 0.786147 | 0.085386 | 0.050931 | 0.151708 | 1.761553 | 0.042894 | False | False | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 2019-04-01 02:00:00+00 | 0.687678 | 3.530669 | 0.099239 | 0.055706 | 0.151233 | 4.524525 | 0.043100 | False | False | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 2019-04-01 03:00:00+00 | 0.690139 | 4.044003 | 0.098467 | 0.050019 | 0.151436 | 5.034064 | 0.043131 | False | False | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 2019-04-01 04:00:00+00 | 0.687081 | 4.223769 | 0.091533 | 0.050142 | 0.151331 | 5.203856 | 0.043031 | False | False | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 26 columns
In [ ]:
# normalise column names so they are code friendly:
# lower-case everything and turn spaces into underscores
new_col_names_map = {}
for original_name in data.columns:
    new_col_names_map[original_name] = original_name.lower().replace(" ", "_")
data = data.rename(columns=new_col_names_map)
data.head()
Out[ ]:
| time | comms_and_services | space_heating | hot_water | sockets | lighting | bld_engcons | car_chargers | weekend | bank_holiday | ... | forecastperiod | forecast_temperature | forecast_feelslike | forecast_weathertype | forecast_windspeed | forecast_uvindex | forecast_precipitationprobability | forecast_winddirection | forecast_visibility | forecast_interval | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019-04-01 00:00:00+00 | 0.515253 | 0.856489 | 0.000000 | 0.051467 | 0.124797 | 1.548006 | 0.032239 | False | False | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 2019-04-01 01:00:00+00 | 0.687381 | 0.786147 | 0.085386 | 0.050931 | 0.151708 | 1.761553 | 0.042894 | False | False | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 2019-04-01 02:00:00+00 | 0.687678 | 3.530669 | 0.099239 | 0.055706 | 0.151233 | 4.524525 | 0.043100 | False | False | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 2019-04-01 03:00:00+00 | 0.690139 | 4.044003 | 0.098467 | 0.050019 | 0.151436 | 5.034064 | 0.043131 | False | False | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 2019-04-01 04:00:00+00 | 0.687081 | 4.223769 | 0.091533 | 0.050142 | 0.151331 | 5.203856 | 0.043031 | False | False | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 26 columns
In [ ]:
# check data types for each column; any 'object' column (time,
# forecast_datadate, forecastperiod, forecast_winddirection,
# forecast_visibility, forecast_interval) will need explicit conversion
data.dtypes
Out[ ]:
time object comms_and_services float64 space_heating float64 hot_water float64 sockets float64 lighting float64 bld_engcons float64 car_chargers float64 weekend bool bank_holiday bool hour int64 day_of_week int64 day_of_month int64 month int64 year int64 forecast_datadate object forecastperiod object forecast_temperature float64 forecast_feelslike float64 forecast_weathertype float64 forecast_windspeed float64 forecast_uvindex float64 forecast_precipitationprobability float64 forecast_winddirection object forecast_visibility object forecast_interval object dtype: object
- We need to check and convert all of the `object`-typed columns.
In [ ]:
# Convert the 'time' object column (strings like "2019-04-01 00:00:00+00")
# to timezone-aware UTC timestamps.
# BUG FIX: the previous format="%Y-%m-%d %H:%M:%S" does not cover the
# trailing "+00" UTC offset present in the data; strict format matching in
# modern pandas rejects such strings. utc=True lets pandas parse the offset
# and normalise everything to UTC, matching the tz-aware result shown below.
data["time"] = pd.to_datetime(data["time"], utc=True)
In [ ]:
# checking the range of the time period covered by the data
# (output below shows hourly data from 2019-04-01 to 2023-12-31, UTC)
data["time"].min(), data["time"].max()
Out[ ]:
(Timestamp('2019-04-01 00:00:00+0000', tz='UTC'),
Timestamp('2023-12-31 23:00:00+0000', tz='UTC'))
In [ ]:
# set the parsed 'time' column as the index for time-based slicing,
# resampling, and plotting in pandas
data = data.set_index("time")
In [ ]:
# Fraction of missing values per column: melt the boolean isna() frame and
# draw a normalised ('fill') stacked bar per variable.
sns.displot(
    data=data.isna().melt(value_name='missing'),
    y='variable',
    hue='missing',
    multiple='fill',
    height=8,
    aspect=1.1
)
# red line marks the 40% missing-data alarm threshold on the fraction axis
plt.axvline(0.4, color='r')
# BUG FIX: plt.show() takes no title string — the original passed the title
# as show()'s positional (block) argument, so it was never displayed and
# newer matplotlib rejects it. Set the title explicitly, then show.
plt.title("Na values in the data")
plt.show()
- The missing-data bar chart indicates what fraction of each column is missing; a common alarm threshold is 40%.
In [ ]:
# missingno matrix: one row per sample (y axis), one column per feature
# (x axis); white gaps show exactly where values are NaN, revealing
# per-instance missingness patterns
msno.matrix(data)
plt.title("NA values in the data")
plt.show()
- The missing-data matrix above shows each sample on the y-axis and each feature on the x-axis, making it easy to see how NA values are distributed across individual samples.
In [ ]:
# Pairwise scatter/histogram grid to eyeball each variable's distribution
# and its relationship with the other variables.
# BUG FIX: plt.title() after sns.pairplot() only titles the last subplot of
# the PairGrid, and the separate plt.suptitle() overlapped the top row.
# Attach a single suptitle to the grid's own figure instead, nudged above
# the axes with y=1.02.
grid = sns.pairplot(data)
grid.fig.suptitle(
    "Pairplot: data distribution and relationship with other variables",
    y=1.02,
)
plt.show()